{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1、 导入必要的工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "#评价指标为logloss\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import log_loss\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2、导入数据并确定训练数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pregnancies</th>\n",
       "      <th>Glucose</th>\n",
       "      <th>BloodPressure</th>\n",
       "      <th>SkinThickness</th>\n",
       "      <th>Insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>DiabetesPedigreeFunction</th>\n",
       "      <th>Age</th>\n",
       "      <th>Outcome</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>168.0</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \\\n",
       "0            6    148.0           72.0           35.0    125.0  33.6   \n",
       "1            1     85.0           66.0           29.0    125.0  26.6   \n",
       "2            8    183.0           64.0           29.0    125.0  23.3   \n",
       "3            1     89.0           66.0           23.0     94.0  28.1   \n",
       "4            0    137.0           40.0           35.0    168.0  43.1   \n",
       "\n",
       "   DiabetesPedigreeFunction  Age  Outcome  \n",
       "0                     0.627   50        1  \n",
       "1                     0.351   31        0  \n",
       "2                     0.672   32        1  \n",
       "3                     0.167   21        0  \n",
       "4                     2.288   33        1  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv('FE_Diabetes.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train= train.drop(['Outcome'],axis=1)\n",
    "y_train = train['Outcome']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3、数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "ss_X = StandardScaler()\n",
    "#分别对训练和测试数据的特征进行标准化处理\n",
    "X_train = ss_X.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4、模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4.1 正则化的Logistic Regression 及使用neg_log_loss做评价指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最佳的分数  0.4760280199877287\n",
      "最佳的参数  {'penalty': 'l1', 'C': 1}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "penaltys = ['l1','l2']\n",
    "Cs = [0.001,0.01,0.1,1,10,100,1000]\n",
    "tuned_parameters = dict(penalty = penaltys,C = Cs)\n",
    "\n",
    "lr_penalty = LogisticRegression()\n",
    "grid = GridSearchCV(lr_penalty,tuned_parameters,cv=5,scoring='neg_log_loss')\n",
    "grid.fit(X_train,y_train)\n",
    "\n",
    "print '最佳的分数 ',-grid.best_score_\n",
    "print '最佳的参数 ',grid.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4.2 正则化的Logistic Regression 及使用换正确率做评价指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最佳的分数  0.7747395833333334\n",
      "最佳的参数  {'penalty': 'l2', 'C': 0.1}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "penaltys = ['l1','l2']\n",
    "Cs = [0.001,0.01,0.1,1,10,100,1000]\n",
    "tuned_parameters = dict(penalty = penaltys,C = Cs)\n",
    "\n",
    "lr_penalty = LogisticRegression()\n",
    "grid = GridSearchCV(lr_penalty,tuned_parameters,cv=5)\n",
    "grid.fit(X_train,y_train)\n",
    "\n",
    "print '最佳的分数 ',grid.best_score_\n",
    "print '最佳的参数 ',grid.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
